{
 "nbformat": 4,
 "nbformat_minor": 0,
 "metadata": {
  "colab": {
   "provenance": []
  },
  "kernelspec": {
   "name": "python3",
   "display_name": "Python 3"
  },
  "language_info": {
   "name": "python"
  }
 },
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "UR_uNtd16RlC"
   },
   "outputs": [],
   "source": [
    "# %pip (rather than !pip) installs into the environment of the running kernel.\n",
    "%pip install -Uqqq pip --progress-bar off\n",
    "%pip install -qqq langchain==0.0.141 --progress-bar off\n",
    "%pip install -qqq openai==0.27.4 --progress-bar off\n",
    "%pip install -Uqqq watermark==2.3.1 --progress-bar off\n",
    "%pip install -Uqqq chromadb==0.3.21 --progress-bar off\n",
    "%pip install -Uqqq tiktoken==0.3.3 --progress-bar off\n",
    "%pip install -Uqqq youtube-transcript-api==0.5.0 --progress-bar off\n",
    "%pip install -Uqqq pytube==12.1.3 --progress-bar off\n",
    "# Quoted so the shell cannot treat the [extras] brackets as a glob pattern.\n",
    "%pip install -Uqqq \"unstructured[local-inference]==0.5.12\" --progress-bar off"
   ]
  },
  {
   "cell_type": "code",
   "source": [
    "import getpass\n",
    "import os\n",
    "import textwrap\n",
    "\n",
    "import chromadb\n",
    "import langchain\n",
    "import openai\n",
    "from langchain.chains import RetrievalQA\n",
    "from langchain.chat_models import ChatOpenAI\n",
    "from langchain.document_loaders import TextLoader, UnstructuredPDFLoader, YoutubeLoader\n",
    "from langchain.embeddings import HuggingFaceEmbeddings, OpenAIEmbeddings\n",
    "from langchain.indexes import VectorstoreIndexCreator\n",
    "from langchain.llms import OpenAI\n",
    "from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
    "from langchain.vectorstores import Chroma"
   ],
   "metadata": {
    "id": "qWD240smBU6z"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "%load_ext watermark\n",
    "%watermark --iversions -v -m"
   ],
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "3FFyV3BNBSnd",
    "outputId": "24e25ebc-ba5c-445c-ff71-95658117db2e"
   },
   "execution_count": null,
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Python implementation: CPython\n",
      "Python version       : 3.9.16\n",
      "IPython version      : 7.34.0\n",
      "\n",
      "Compiler    : GCC 9.4.0\n",
      "OS          : Linux\n",
      "Release     : 5.10.147+\n",
      "Machine     : x86_64\n",
      "Processor   : x86_64\n",
      "CPU cores   : 2\n",
      "Architecture: 64bit\n",
      "\n",
      "chromadb : 0.3.21\n",
      "openai   : 0.27.4\n",
      "langchain: 0.0.141\n",
      "\n"
     ]
    }
   ]
  },
  {
   "cell_type": "code",
   "source": [
    "def print_response(response: str, width: int = 100) -> None:\n",
    "    \"\"\"Print an LLM response wrapped to at most ``width`` columns.\n",
    "\n",
    "    Each line of the response is wrapped on its own so that paragraph and list\n",
    "    breaks in the answer are kept; wrapping the whole string at once would turn\n",
    "    every newline into a space. Surrounding whitespace (for example the leading\n",
    "    space and trailing newline seen in completion-model answers) is stripped\n",
    "    before wrapping.\n",
    "\n",
    "    Args:\n",
    "        response: Text returned by the model or chain.\n",
    "        width: Maximum line length of the printed output.\n",
    "    \"\"\"\n",
    "    lines = response.strip().splitlines()\n",
    "    print(\"\\n\".join(textwrap.fill(line, width=width) for line in lines))"
   ],
   "metadata": {
    "id": "GUqPyc20MxUh"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "# Keep a key that is already configured in the environment; otherwise prompt\n",
    "# for it so the secret is never written into the notebook source.\n",
    "if not os.environ.get(\"OPENAI_API_KEY\"):\n",
    "    os.environ[\"OPENAI_API_KEY\"] = getpass.getpass(\"OpenAI API key: \")"
   ],
   "metadata": {
    "id": "C-6WIHwKJ-ev"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "# Download the two sample files from Google Drive by file ID.\n",
    "# NOTE(review): the first ID is presumably the essay stored as\n",
    "# ./the-need-to-read.txt (loaded by TextLoader below); its download log is not\n",
    "# part of the saved output, so confirm.\n",
    "!gdown 1eetuan04uj9-QKu_Vok2mbSK23G0H7yN\n",
    "# Saved as Andrej_Karpathy_Resume.pdf (see the download log in this cell's output).\n",
    "!gdown 1MVIhlCJS5RjVDy_s93Jb4vkHt6jAmgaa"
   ],
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "IV7etHSwC4jm",
    "outputId": "d1cbc8dc-138e-4a32-836b-1d964f69fa0e"
   },
   "execution_count": null,
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Downloading...\n",
      "From: https://drive.google.com/uc?id=1MVIhlCJS5RjVDy_s93Jb4vkHt6jAmgaa\n",
      "To: /content/Andrej_Karpathy_Resume.pdf\n",
      "\r  0% 0.00/46.9k [00:00<?, ?B/s]\r100% 46.9k/46.9k [00:00<00:00, 48.9MB/s]\n"
     ]
    }
   ]
  },
  {
   "cell_type": "code",
   "source": [
    "txt_loader = TextLoader(\"./the-need-to-read.txt\", encoding=\"utf8\")"
   ],
   "metadata": {
    "id": "O9URGZ4EJs8p"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "index = VectorstoreIndexCreator().from_loaders([txt_loader])"
   ],
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "buDonlV1J5uT",
    "outputId": "2e8f4563-d085-4525-c562-87ee470625fa"
   },
   "execution_count": null,
   "outputs": [
    {
     "output_type": "stream",
     "name": "stderr",
     "text": [
      "WARNING:chromadb:Using embedded DuckDB without persistence: data will be transient\n"
     ]
    }
   ]
  },
  {
   "cell_type": "code",
   "source": [
    "query = \"Why someone in todays world would read? Answer in 3 sentences.\"\n",
    "result = index.query_with_sources(query)\n",
    "result"
   ],
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "W3Z7djTcMBJz",
    "outputId": "22794e99-0516-44b1-f8a9-f16f0ca916aa"
   },
   "execution_count": null,
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "{'question': 'Why someone in todays world would read? Answer in 3 sentences.',\n",
       " 'answer': ' Reading helps to develop critical thinking skills, encourages creativity, and allows for the discovery of new ideas. It also helps to develop writing skills, which is important for expressing and exploring ideas.\\n',\n",
       " 'sources': './the-need-to-read.txt'}"
      ]
     },
     "metadata": {},
     "execution_count": 27
    }
   ]
  },
  {
   "cell_type": "code",
   "source": [
    "print_response(result[\"answer\"])"
   ],
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "wf0aLZD4MrDU",
    "outputId": "71e8cd56-72b6-417d-8360-dac56f678ef2"
   },
   "execution_count": null,
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      " Reading helps to develop critical thinking skills, encourages creativity, and allows for the\n",
      "discovery of new ideas. It also helps to develop writing skills, which is important for expressing\n",
      "and exploring ideas.\n"
     ]
    }
   ]
  },
  {
   "cell_type": "markdown",
   "source": [
    "## Loaders"
   ],
   "metadata": {
    "id": "oBPR2tNB7La6"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "yt_loader = YoutubeLoader.from_youtube_url(\n",
    "    \"https://www.youtube.com/watch?v=n2uY3-2Goek\", add_video_info=True\n",
    ")"
   ],
   "metadata": {
    "id": "TlIgJ0hsD1gX"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "yt_documents = yt_loader.load()\n",
    "yt_documents"
   ],
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "Q626C3NND5fh",
    "outputId": "def93b6b-17f5-42a3-fdcf-8ec3b90f5063"
   },
   "execution_count": null,
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "[Document(page_content=\"you get more out of reading one book that's great five times than out of reading five mediocre books if your behavior doesn't change as a result of reading a book and it means you learn nothing which means it was a waste of time and many people who read books are just wasting their time because their behavior doesn't change and so I consolidate once I find something that's good I plug everything I possibly can into it and suck the juice out of it so that I can change my behavior as a result which comes from the Frameworks and how I think about it so I read one thing that's very good many times rather than trying to brag about the fact that I read a book a week because I'm like what was the book last week not that good that it wasn't worth rereading\", metadata={'source': 'n2uY3-2Goek', 'title': 'How to get the most out of reading', 'description': \"WE'RE BUYING! $1M-10M EBITDA Founders - We invest and help you scale faster. To find out more, apply here: https://acquisition.com \\n\\nWe invest in everything from youtube channels to local businesses to IT services. \\n\\nFor everyone else, I make my money buying and growing businesses. I make this free content with the hopes you use it to grow your business enough to partner with us.\", 'view_count': 30422, 'thumbnail_url': 'https://i.ytimg.com/vi/n2uY3-2Goek/sd2.jpg?sqp=-oaymwEoCIAFEOAD8quKqQMcGADwAQH4Ac4FgAKACooCDAgAEAEYQCBCKHIwDw==&rs=AOn4CLDKCjPeWZV37u0ZJ3retgKQYsml2w', 'publish_date': datetime.datetime(2022, 11, 15, 0, 0), 'length': 33, 'author': 'Alex Hormozi'})]"
      ]
     },
     "metadata": {},
     "execution_count": 39
    }
   ]
  },
  {
   "cell_type": "code",
   "source": [
    "document = yt_documents[0]\n",
    "document.page_content"
   ],
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 125
    },
    "id": "Ca9PaoBUECde",
    "outputId": "d63ad7b5-1252-4961-9464-1a9f58d69008"
   },
   "execution_count": null,
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "\"you get more out of reading one book that's great five times than out of reading five mediocre books if your behavior doesn't change as a result of reading a book and it means you learn nothing which means it was a waste of time and many people who read books are just wasting their time because their behavior doesn't change and so I consolidate once I find something that's good I plug everything I possibly can into it and suck the juice out of it so that I can change my behavior as a result which comes from the Frameworks and how I think about it so I read one thing that's very good many times rather than trying to brag about the fact that I read a book a week because I'm like what was the book last week not that good that it wasn't worth rereading\""
      ],
      "application/vnd.google.colaboratory.intrinsic+json": {
       "type": "string"
      }
     },
     "metadata": {},
     "execution_count": 40
    }
   ]
  },
  {
   "cell_type": "code",
   "source": [
    "# Tip: use OnlinePDFLoader instead to load a PDF directly from a URL\n",
    "pdf_loader = UnstructuredPDFLoader(\"./Andrej_Karpathy_Resume.pdf\")\n",
    "pdf_pages = pdf_loader.load_and_split()"
   ],
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "062XoRf2Uhcp",
    "outputId": "5e733687-e107-4e27-bdac-52073b31c0ea"
   },
   "execution_count": null,
   "outputs": [
    {
     "output_type": "stream",
     "name": "stderr",
     "text": [
      "WARNING:unstructured:detectron2 is not installed. Cannot use the hi_res partitioning strategy. Falling back to partitioning with the fast strategy.\n"
     ]
    }
   ]
  },
  {
   "cell_type": "code",
   "source": [
    "len(pdf_pages[0].page_content)"
   ],
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "ReR1tt38kDDz",
    "outputId": "fc56c30f-0757-414b-9930-e6ed85427099"
   },
   "execution_count": null,
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "1434"
      ]
     },
     "metadata": {},
     "execution_count": 22
    }
   ]
  },
  {
   "cell_type": "code",
   "source": [
    "pdf_pages[0]"
   ],
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "9radjxmDa__J",
    "outputId": "944ed615-7f1c-413a-b952-af48562ee2ce"
   },
   "execution_count": null,
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "Document(page_content='Andrej Karpathy\\n\\nandrej.karpathy@gmail.com\\n\\nhttp://cs.stanford.edu/~karpathy/\\n\\nEDUCATION\\n\\nStanford University (PhD), 2011 –\\n\\nComputer Science, studying Machine Learning and Computer Vision\\n\\nUniversity of British Columbia (Master’s degree), 2009 - 2011\\n\\nComputer Science graduate studies in Machine Learning, Vision, Motor Control\\n\\nAverage course grade: 94.4%\\n\\nUniversity of Toronto (Bachelor’s degree), 2005 - 2009\\n\\nDouble major in Computer Science and Physics, minor in Mathematics\\n\\nCumulative GPA: 3.74\\n\\nWORK EXPERIENCE\\n\\nGoogle Research (internship), June 2011 – September 2011\\n\\nDeveloped learning algorithms for video classification tasks\\n\\nWorked on a large-scale learning framework for video analysis\\n\\nTeaching Assistant\\n\\n\\n\\n\\n\\n\\n\\n2011: Assisted with online offering of the Machine Learning class at Stanford\\n\\n2011: Graduate Probabilistic Machine Learning class\\n\\n2009-2010: Taught tutorial sections for a first year Discrete Mathematics class on\\n\\nfour consecutive semesters\\n\\nCOURSE WORK\\n\\n\\n\\nStanford: Machine Learning, Computer Vision, Convex Optimization,\\n\\nProbabilistic Graphical Models (I and II)\\n\\nUniversity of British Columbia: Machine Learning (I and II), Computer Vision (I\\n\\nand II)\\n\\nHACKING SKILLS\\n\\n\\n\\nPython, C++, MATLAB, Java, Objective C, Javascript/HTML/CSS, PHP, SQL\\n\\nINTERESTS\\n\\nHobbies include Ping Pong, Ice skating, Scuba diving, PC strategy/fps games,\\n\\nProgramming, and solving the Rubik’s cube in less than 20 seconds', metadata={'source': './Andrej_Karpathy_Resume.pdf'})"
      ]
     },
     "metadata": {},
     "execution_count": 13
    }
   ]
  },
  {
   "cell_type": "markdown",
   "source": [
    "## Text Splitters"
   ],
   "metadata": {
    "id": "sGk3kqP67M7y"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "# Split the PDF text into chunks of at most 1024 characters; consecutive chunks\n",
    "# share up to 64 characters so the text around a boundary appears in both.\n",
    "text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=64)\n",
    "texts = text_splitter.split_documents(pdf_pages)\n",
    "# Number of chunks produced.\n",
    "len(texts)"
   ],
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "FdUOyWP1iFNf",
    "outputId": "c2f8406f-de0c-4eb4-a930-2394fee20d96"
   },
   "execution_count": null,
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "2"
      ]
     },
     "metadata": {},
     "execution_count": 48
    }
   ]
  },
  {
   "cell_type": "code",
   "source": [
    "len(texts[0].page_content), len(texts[1].page_content)"
   ],
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "sH1j_Gosj2j1",
    "outputId": "d9a2a64f-a400-490f-9dfa-ed57eb26a099"
   },
   "execution_count": null,
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "(987, 486)"
      ]
     },
     "metadata": {},
     "execution_count": 49
    }
   ]
  },
  {
   "cell_type": "code",
   "source": [
    "texts[0]"
   ],
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "Zz8mt_iciQO8",
    "outputId": "48aefe51-7eed-435b-9a01-bc214e9c6b29"
   },
   "execution_count": null,
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "Document(page_content='Andrej Karpathy\\n\\nandrej.karpathy@gmail.com\\n\\nhttp://cs.stanford.edu/~karpathy/\\n\\nEDUCATION\\n\\nStanford University (PhD), 2011 –\\n\\nComputer Science, studying Machine Learning and Computer Vision\\n\\nUniversity of British Columbia (Master’s degree), 2009 - 2011\\n\\nComputer Science graduate studies in Machine Learning, Vision, Motor Control\\n\\nAverage course grade: 94.4%\\n\\nUniversity of Toronto (Bachelor’s degree), 2005 - 2009\\n\\nDouble major in Computer Science and Physics, minor in Mathematics\\n\\nCumulative GPA: 3.74\\n\\nWORK EXPERIENCE\\n\\nGoogle Research (internship), June 2011 – September 2011\\n\\nDeveloped learning algorithms for video classification tasks\\n\\nWorked on a large-scale learning framework for video analysis\\n\\nTeaching Assistant\\n\\n\\n\\n\\n\\n\\n\\n2011: Assisted with online offering of the Machine Learning class at Stanford\\n\\n2011: Graduate Probabilistic Machine Learning class\\n\\n2009-2010: Taught tutorial sections for a first year Discrete Mathematics class on\\n\\nfour consecutive semesters\\n\\nCOURSE WORK', metadata={'source': './Andrej_Karpathy_Resume.pdf'})"
      ]
     },
     "metadata": {},
     "execution_count": 50
    }
   ]
  },
  {
   "cell_type": "code",
   "source": [
    "texts[1]"
   ],
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "kfXXJ1ZviSpM",
    "outputId": "c8492d7e-2d9e-406c-d57a-4363244f4fa8"
   },
   "execution_count": null,
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "Document(page_content='four consecutive semesters\\n\\nCOURSE WORK\\n\\n\\n\\nStanford: Machine Learning, Computer Vision, Convex Optimization,\\n\\nProbabilistic Graphical Models (I and II)\\n\\nUniversity of British Columbia: Machine Learning (I and II), Computer Vision (I\\n\\nand II)\\n\\nHACKING SKILLS\\n\\n\\n\\nPython, C++, MATLAB, Java, Objective C, Javascript/HTML/CSS, PHP, SQL\\n\\nINTERESTS\\n\\nHobbies include Ping Pong, Ice skating, Scuba diving, PC strategy/fps games,\\n\\nProgramming, and solving the Rubik’s cube in less than 20 seconds', metadata={'source': './Andrej_Karpathy_Resume.pdf'})"
      ]
     },
     "metadata": {},
     "execution_count": 51
    }
   ]
  },
  {
   "cell_type": "markdown",
   "source": [
    "## Embeddings"
   ],
   "metadata": {
    "id": "iADFckWU7kjl"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "MODEL_NAME = \"sentence-transformers/all-mpnet-base-v2\"\n",
    "hf_embeddings = HuggingFaceEmbeddings(model_name=MODEL_NAME)"
   ],
   "metadata": {
    "id": "PTJdcEPFzGPw"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "text = texts[0].page_content\n",
    "text"
   ],
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 161
    },
    "id": "ZsvwTQSC0nPK",
    "outputId": "0daaffba-0359-46c6-b6c4-0580e9ec15e0"
   },
   "execution_count": null,
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "'Andrej Karpathy\\n\\nandrej.karpathy@gmail.com\\n\\nhttp://cs.stanford.edu/~karpathy/\\n\\nEDUCATION\\n\\nStanford University (PhD), 2011 –\\n\\nComputer Science, studying Machine Learning and Computer Vision\\n\\nUniversity of British Columbia (Master’s degree), 2009 - 2011\\n\\nComputer Science graduate studies in Machine Learning, Vision, Motor Control\\n\\nAverage course grade: 94.4%\\n\\nUniversity of Toronto (Bachelor’s degree), 2005 - 2009\\n\\nDouble major in Computer Science and Physics, minor in Mathematics\\n\\nCumulative GPA: 3.74\\n\\nWORK EXPERIENCE\\n\\nGoogle Research (internship), June 2011 – September 2011\\n\\nDeveloped learning algorithms for video classification tasks\\n\\nWorked on a large-scale learning framework for video analysis\\n\\nTeaching Assistant\\n\\n\\n\\n\\n\\n\\n\\n2011: Assisted with online offering of the Machine Learning class at Stanford\\n\\n2011: Graduate Probabilistic Machine Learning class\\n\\n2009-2010: Taught tutorial sections for a first year Discrete Mathematics class on\\n\\nfour consecutive semesters\\n\\nCOURSE WORK'"
      ],
      "application/vnd.google.colaboratory.intrinsic+json": {
       "type": "string"
      }
     },
     "metadata": {},
     "execution_count": 120
    }
   ]
  },
  {
   "cell_type": "code",
   "source": [
    "hf_embedding = hf_embeddings.embed_documents([text])\n",
    "len(hf_embedding[0])"
   ],
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "xaxpig8Vzl9X",
    "outputId": "09e3fb26-a2cf-4846-ccf0-bcb9bae360e9"
   },
   "execution_count": null,
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "768"
      ]
     },
     "metadata": {},
     "execution_count": 115
    }
   ]
  },
  {
   "cell_type": "code",
   "source": [
    "hf_embedding[0][:10]"
   ],
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "hoaqgGy20ICi",
    "outputId": "ccd778b7-e8c8-4f2f-dd21-33720df2d193"
   },
   "execution_count": null,
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "[-0.0012547640362754464,\n",
       " 0.05444266274571419,\n",
       " -0.041984450072050095,\n",
       " -0.019023854285478592,\n",
       " 0.007353615947067738,\n",
       " -0.012013374827802181,\n",
       " 0.06387557089328766,\n",
       " -0.02246193215250969,\n",
       " -0.04335080459713936,\n",
       " -0.04206854850053787]"
      ]
     },
     "metadata": {},
     "execution_count": 116
    }
   ]
  },
  {
   "cell_type": "code",
   "source": [
    "embeddings = OpenAIEmbeddings()"
   ],
   "metadata": {
    "id": "OepRIjzu7POT"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "openai_embedding = embeddings.embed_documents([text])\n",
    "len(openai_embedding[0])"
   ],
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "b1mJ8ksB0dCE",
    "outputId": "3296cd99-9966-42ff-ea81-b3a9388fdade"
   },
   "execution_count": null,
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "1536"
      ]
     },
     "metadata": {},
     "execution_count": 119
    }
   ]
  },
  {
   "cell_type": "code",
   "source": [
    "openai_embedding[0][:10]"
   ],
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "s3x6CRko0hqC",
    "outputId": "a6939f92-8967-495b-cb93-859852cad02e"
   },
   "execution_count": null,
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "[-0.0034319146679993133,\n",
       " 0.016217479770247397,\n",
       " 0.020403068874950882,\n",
       " -0.03693009233481942,\n",
       " 0.01301435869943405,\n",
       " 0.025678797149630162,\n",
       " -0.00714645780273548,\n",
       " 0.017321074689020152,\n",
       " -0.03157361652884209,\n",
       " -0.020618405559186648]"
      ]
     },
     "metadata": {},
     "execution_count": 129
    }
   ]
  },
  {
   "cell_type": "markdown",
   "source": [
    "## Vectorstores"
   ],
   "metadata": {
    "id": "8vVyDPSB7mo-"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "db = Chroma.from_documents(texts, embeddings)"
   ],
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "5eBmO0Wy7npk",
    "outputId": "41fa8d8a-cd51-482a-dc91-6f28c389eca6"
   },
   "execution_count": null,
   "outputs": [
    {
     "output_type": "stream",
     "name": "stderr",
     "text": [
      "WARNING:chromadb:Using embedded DuckDB without persistence: data will be transient\n"
     ]
    }
   ]
  },
  {
   "cell_type": "code",
   "source": [
    "db.similarity_search_with_score(\"What is the candidate work experience?\", k=2)"
   ],
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "FWDSsxEDpLE4",
    "outputId": "3ff57635-c131-4dc8-b4c3-1aee62a0aba5"
   },
   "execution_count": null,
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "[(Document(page_content='four consecutive semesters\\n\\nCOURSE WORK\\n\\n\\n\\nStanford: Machine Learning, Computer Vision, Convex Optimization,\\n\\nProbabilistic Graphical Models (I and II)\\n\\nUniversity of British Columbia: Machine Learning (I and II), Computer Vision (I\\n\\nand II)\\n\\nHACKING SKILLS\\n\\n\\n\\nPython, C++, MATLAB, Java, Objective C, Javascript/HTML/CSS, PHP, SQL\\n\\nINTERESTS\\n\\nHobbies include Ping Pong, Ice skating, Scuba diving, PC strategy/fps games,\\n\\nProgramming, and solving the Rubik’s cube in less than 20 seconds', metadata={'source': './Andrej_Karpathy_Resume.pdf'}),\n",
       "  0.4737962484359741),\n",
       " (Document(page_content='Andrej Karpathy\\n\\nandrej.karpathy@gmail.com\\n\\nhttp://cs.stanford.edu/~karpathy/\\n\\nEDUCATION\\n\\nStanford University (PhD), 2011 –\\n\\nComputer Science, studying Machine Learning and Computer Vision\\n\\nUniversity of British Columbia (Master’s degree), 2009 - 2011\\n\\nComputer Science graduate studies in Machine Learning, Vision, Motor Control\\n\\nAverage course grade: 94.4%\\n\\nUniversity of Toronto (Bachelor’s degree), 2005 - 2009\\n\\nDouble major in Computer Science and Physics, minor in Mathematics\\n\\nCumulative GPA: 3.74\\n\\nWORK EXPERIENCE\\n\\nGoogle Research (internship), June 2011 – September 2011\\n\\nDeveloped learning algorithms for video classification tasks\\n\\nWorked on a large-scale learning framework for video analysis\\n\\nTeaching Assistant\\n\\n\\n\\n\\n\\n\\n\\n2011: Assisted with online offering of the Machine Learning class at Stanford\\n\\n2011: Graduate Probabilistic Machine Learning class\\n\\n2009-2010: Taught tutorial sections for a first year Discrete Mathematics class on\\n\\nfour consecutive semesters\\n\\nCOURSE WORK', metadata={'source': './Andrej_Karpathy_Resume.pdf'}),\n",
       "  0.4828885793685913)]"
      ]
     },
     "metadata": {},
     "execution_count": 83
    }
   ]
  },
  {
   "cell_type": "markdown",
   "source": [
    "### Storing and Loading Embeddings"
   ],
   "metadata": {
    "id": "_yVUrEQOoyJo"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "PERSIST_DIRECTORY = \"db\"\n",
    "\n",
    "db = Chroma.from_documents(\n",
    "    documents=texts, embedding=embeddings, persist_directory=PERSIST_DIRECTORY\n",
    ")"
   ],
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "PkWBekh9o8Ie",
    "outputId": "64aa811c-f9f8-4f32-975e-083455ae7834"
   },
   "execution_count": null,
   "outputs": [
    {
     "output_type": "stream",
     "name": "stderr",
     "text": [
      "WARNING:chromadb:Using embedded DuckDB with persistence: data will be stored in: db\n"
     ]
    }
   ]
  },
  {
   "cell_type": "code",
   "source": [
    "db.similarity_search_with_score(\"What is the candidate work experience?\", k=2)"
   ],
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "0DO8OgwXtU-L",
    "outputId": "c50133e2-f86b-4b5e-c521-a81b7bd02bc0"
   },
   "execution_count": null,
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "[(Document(page_content='four consecutive semesters\\n\\nCOURSE WORK\\n\\n\\n\\nStanford: Machine Learning, Computer Vision, Convex Optimization,\\n\\nProbabilistic Graphical Models (I and II)\\n\\nUniversity of British Columbia: Machine Learning (I and II), Computer Vision (I\\n\\nand II)\\n\\nHACKING SKILLS\\n\\n\\n\\nPython, C++, MATLAB, Java, Objective C, Javascript/HTML/CSS, PHP, SQL\\n\\nINTERESTS\\n\\nHobbies include Ping Pong, Ice skating, Scuba diving, PC strategy/fps games,\\n\\nProgramming, and solving the Rubik’s cube in less than 20 seconds', metadata={'source': './Andrej_Karpathy_Resume.pdf'}),\n",
       "  0.47347402572631836),\n",
       " (Document(page_content='Andrej Karpathy\\n\\nandrej.karpathy@gmail.com\\n\\nhttp://cs.stanford.edu/~karpathy/\\n\\nEDUCATION\\n\\nStanford University (PhD), 2011 –\\n\\nComputer Science, studying Machine Learning and Computer Vision\\n\\nUniversity of British Columbia (Master’s degree), 2009 - 2011\\n\\nComputer Science graduate studies in Machine Learning, Vision, Motor Control\\n\\nAverage course grade: 94.4%\\n\\nUniversity of Toronto (Bachelor’s degree), 2005 - 2009\\n\\nDouble major in Computer Science and Physics, minor in Mathematics\\n\\nCumulative GPA: 3.74\\n\\nWORK EXPERIENCE\\n\\nGoogle Research (internship), June 2011 – September 2011\\n\\nDeveloped learning algorithms for video classification tasks\\n\\nWorked on a large-scale learning framework for video analysis\\n\\nTeaching Assistant\\n\\n\\n\\n\\n\\n\\n\\n2011: Assisted with online offering of the Machine Learning class at Stanford\\n\\n2011: Graduate Probabilistic Machine Learning class\\n\\n2009-2010: Taught tutorial sections for a first year Discrete Mathematics class on\\n\\nfour consecutive semesters\\n\\nCOURSE WORK', metadata={'source': './Andrej_Karpathy_Resume.pdf'}),\n",
       "  0.4828885793685913)]"
      ]
     },
     "metadata": {},
     "execution_count": 88
    }
   ]
  },
  {
   "cell_type": "code",
   "source": [
    "db.persist()"
   ],
   "metadata": {
    "id": "EiRCXYiWpJFz"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "Load the persisted vector store back from disk:"
   ],
   "metadata": {
    "id": "MN6cCrqkq_ac"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "vectordb = Chroma(persist_directory=PERSIST_DIRECTORY, embedding_function=embeddings)"
   ],
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "JB6kMo0GqT1h",
    "outputId": "1d34d0c0-3b1d-4cef-c661-c20644e30080"
   },
   "execution_count": null,
   "outputs": [
    {
     "output_type": "stream",
     "name": "stderr",
     "text": [
      "WARNING:chromadb:Using embedded DuckDB with persistence: data will be stored in: db\n"
     ]
    }
   ]
  },
  {
   "cell_type": "code",
   "source": [
    "vectordb.similarity_search_with_score(\"What is the candidate work experience?\", k=2)"
   ],
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "7T0R9eofq5hN",
    "outputId": "735d3713-4bdd-496f-e324-790f24fc4518"
   },
   "execution_count": null,
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "[(Document(page_content='four consecutive semesters\\n\\nCOURSE WORK\\n\\n\\n\\nStanford: Machine Learning, Computer Vision, Convex Optimization,\\n\\nProbabilistic Graphical Models (I and II)\\n\\nUniversity of British Columbia: Machine Learning (I and II), Computer Vision (I\\n\\nand II)\\n\\nHACKING SKILLS\\n\\n\\n\\nPython, C++, MATLAB, Java, Objective C, Javascript/HTML/CSS, PHP, SQL\\n\\nINTERESTS\\n\\nHobbies include Ping Pong, Ice skating, Scuba diving, PC strategy/fps games,\\n\\nProgramming, and solving the Rubik’s cube in less than 20 seconds', metadata={'source': './Andrej_Karpathy_Resume.pdf'}),\n",
       "  0.47347402572631836),\n",
       " (Document(page_content='Andrej Karpathy\\n\\nandrej.karpathy@gmail.com\\n\\nhttp://cs.stanford.edu/~karpathy/\\n\\nEDUCATION\\n\\nStanford University (PhD), 2011 –\\n\\nComputer Science, studying Machine Learning and Computer Vision\\n\\nUniversity of British Columbia (Master’s degree), 2009 - 2011\\n\\nComputer Science graduate studies in Machine Learning, Vision, Motor Control\\n\\nAverage course grade: 94.4%\\n\\nUniversity of Toronto (Bachelor’s degree), 2005 - 2009\\n\\nDouble major in Computer Science and Physics, minor in Mathematics\\n\\nCumulative GPA: 3.74\\n\\nWORK EXPERIENCE\\n\\nGoogle Research (internship), June 2011 – September 2011\\n\\nDeveloped learning algorithms for video classification tasks\\n\\nWorked on a large-scale learning framework for video analysis\\n\\nTeaching Assistant\\n\\n\\n\\n\\n\\n\\n\\n2011: Assisted with online offering of the Machine Learning class at Stanford\\n\\n2011: Graduate Probabilistic Machine Learning class\\n\\n2009-2010: Taught tutorial sections for a first year Discrete Mathematics class on\\n\\nfour consecutive semesters\\n\\nCOURSE WORK', metadata={'source': './Andrej_Karpathy_Resume.pdf'}),\n",
       "  0.4828885793685913)]"
      ]
     },
     "metadata": {},
     "execution_count": 91
    }
   ]
  },
  {
   "cell_type": "markdown",
   "source": [
    "## Use a Chain"
   ],
   "metadata": {
    "id": "LGzDuztespYH"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "# Question answering over the resume: the retriever returns the 2 most similar\n",
    "# chunks from `db` and the chat model answers from them. temperature=0 is used\n",
    "# to keep the answers as repeatable as possible.\n",
    "# NOTE(review): chain_type=\"stuff\" presumably places all retrieved chunks into a\n",
    "# single prompt - confirm against the LangChain docs.\n",
    "chain = RetrievalQA.from_chain_type(\n",
    "    llm=ChatOpenAI(temperature=0),\n",
    "    chain_type=\"stuff\",\n",
    "    retriever=db.as_retriever(search_kwargs={\"k\": 2}),\n",
    ")"
   ],
   "metadata": {
    "id": "aIoylGwastp8"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "query = \"What is the work experience of the candidate? Use no more than 2 sentences.\"\n",
    "response = chain.run(query)"
   ],
   "metadata": {
    "id": "B2LeOFLwtkho"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "print_response(response)"
   ],
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "uYVgNvqxtqv2",
    "outputId": "a34f53b5-3fcb-49f1-f842-53ec3eba6618"
   },
   "execution_count": null,
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "The candidate has worked as a Google Research intern, developing learning algorithms for video\n",
      "classification tasks and worked on a large-scale learning framework for video analysis. They have\n",
      "also worked as a teaching assistant for various classes.\n"
     ]
    }
   ]
  },
  {
   "cell_type": "code",
   "source": [
    "query = \"Give a background summary of the candidate. Use no more than 3 sentences.\"\n",
    "response = chain.run(query)\n",
    "print_response(response)"
   ],
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "mR_yuCXVC5tV",
    "outputId": "f67c3286-f566-4bc2-97ca-7e2e440400ce"
   },
   "execution_count": null,
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Andrej Karpathy is a PhD student in Computer Science at Stanford University, studying Machine\n",
      "Learning and Computer Vision. He has a Master's degree in Computer Science from the University of\n",
      "British Columbia and a Bachelor's degree in Computer Science and Physics from the University of\n",
      "Toronto. He has worked as a Teaching Assistant and interned at Google Research.\n"
     ]
    }
   ]
  },
  {
   "cell_type": "code",
   "source": [
    "query = \"\"\"\n",
    "How likely is this candidate to be a top-tier DL researcher 2 years from now? \n",
    "Use 0-10 scale, where\n",
    "0 - chance is nonexistent\n",
    "10 - beyond reasonable doubt\n",
    "\n",
    "You must choose a number and explain why\n",
    "\"\"\"\n",
    "response = chain.run(query)\n",
    "print_response(response)"
   ],
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "BbaGBZhMB2IC",
    "outputId": "20cfd89e-9de4-45ae-d4cf-cadd873a988e"
   },
   "execution_count": null,
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "As an AI language model, I cannot predict the future or make assumptions about individuals. However,\n",
      "based on the candidate's educational background, work experience, and course work, it seems that\n",
      "they have a strong foundation in machine learning and computer vision. Additionally, their\n",
      "experience as a teaching assistant and their interest in programming and problem-solving suggest\n",
      "that they have a passion for the field. Therefore, I would rate their chances of becoming a top-tier\n",
      "DL researcher 2 years from now as 7 out of 10.\n"
     ]
    }
   ]
  },
  {
   "cell_type": "markdown",
   "source": [
    "## References\n",
    "\n",
    "- [Paul Graham - The Need to Read](http://www.paulgraham.com/read.html)\n",
    "- [Alex Hormozi - How to get the most out of reading](https://www.youtube.com/watch?v=n2uY3-2Goek)\n",
    "- [Andrej Karpathy (very old) Resume](https://cs.stanford.edu/~karpathy/Andrej_Karpathy_Resume.pdf)\n",
    "- [LangChain Docs](https://python.langchain.com/en/latest/index.html)"
   ],
   "metadata": {
    "id": "eljYTvb-AzSt"
   }
  }
 ]
}